# Import libraries
# (duplicate library(fable) removed -- it was loaded twice)
library(ggplot2)
library(dplyr)
library(lattice)
library(tidyverse)
library(lubridate)
library(fpp3)
library(fable)
library(fabletools)
library(feasts)
library(corrplot)
library(forecast)
library(tsibble)
library(zoo)
library(future)
library(tseries)
library(bookdown)
library(Metrics)
library(wavelets)
library(randomForest)
library(xgboost)
library(caret)

1 Introduction

This report provides a detailed analysis of predicting electricity demand in New South Wales (NSW) using advanced statistical and modeling approaches to enhance precision. It examines data from January 2010 to March 2021, featuring half-hourly figures on electricity usage and temperature. The objective is to develop and refine models for better forecasting accuracy.

2 Data Preparation

2.1 Input of Project Shared Data

# Extract each zipped project CSV into a temporary directory and read it in.
readZippedCsv <- function(zipPath, csvName) {
  extractedPath <- unzip(zipPath, files = csvName, exdir = tempdir())[1]
  read_csv(extractedPath)
}

dfTemp <- readZippedCsv("../data/NSW/temperature_nsw.csv.zip", "temperature_nsw.csv")
dfDemand <- readZippedCsv("../data/NSW/totaldemand_nsw.csv.zip", "totaldemand_nsw.csv")

2.2 Data Extraction from Github

# The original forecastdemand file is too large for a single archive, so it
# is split into 5 zipped parts; read each part into its own data frame.
readForecastPart <- function(partNumber) {
  zipPath <- sprintf("../data/NSW/forecastdemand/forecastdemand_part%d.csv.zip", partNumber)
  csvName <- sprintf("forecastdemand_part%d.csv", partNumber)
  read_csv(unzip(zipPath, files = csvName, exdir = tempdir())[1])
}

forecastDemand1 <- readForecastPart(1)
forecastDemand2 <- readForecastPart(2)
forecastDemand3 <- readForecastPart(3)
forecastDemand4 <- readForecastPart(4)
forecastDemand5 <- readForecastPart(5)

2.2.0.1 Combination of 5 forecastDemand files into 1 new dataframe, forecastDemandAll

dfForecastDemand <- bind_rows(forecastDemand1, forecastDemand2, forecastDemand3, forecastDemand4,forecastDemand5) 

2.2.1 Import External Data

## Reference: https://www.michaelplazzer.com/datasets/australian-public-holiday-data/

# Load the Australian public-holiday reference data and keep NSW rows only.
dfHoliday <- read_csv('../data/Aus_public_hols_2009-2022-1.csv') %>%
  filter(State == 'NSW')

2.3 Data Cleaning

2.3.1 NA Values Detection

2.3.1.1 Define NA Value Counting Function

# Report whether a data frame contains any NA values and, if so, how many.
#
# dataFrame: any data frame (or matrix/vector accepted by is.na()).
#
# Prints "True" plus the total NA count when NAs are present, otherwise
# "False" and a confirmation message. Additionally (new, backward-compatible)
# invisibly returns the NA count so callers can use the value
# programmatically; printed output is unchanged.
countNA <- function(dataFrame) {
  # Total NA cells across every column of the data frame
  totalNA <- sum(is.na(dataFrame))

  # Check if there are any NA values
  if (totalNA > 0) {
    cat("True\n")
    cat("Total number of NA values:", totalNA, "\n")
  } else {
    cat("False\n")
    cat("No NA values present.\n")
  }

  invisible(totalNA)
}

2.3.2 Count NA Values for Dataframes

# Run the NA check on each of the four loaded dataframes
countNA(dfDemand)
## False
## No NA values present.
countNA(dfTemp)
## False
## No NA values present.
countNA(dfForecastDemand)
## False
## No NA values present.
countNA(dfHoliday)
## False
## No NA values present.
# Check for duplicates
# NOTE(review): the 13 duplicated rows listed below are only displayed;
# they are never removed from dfTemp, so downstream analysis retains them.
duplicates <- duplicated(dfTemp)

# View the rows that are duplicates
dfTemp[duplicates, ]
## # A tibble: 13 × 3
##    LOCATION  DATETIME         TEMPERATURE
##    <chr>     <chr>                  <dbl>
##  1 Bankstown 1/1/2011 0:00           21  
##  2 Bankstown 10/10/2011 10:30        18.9
##  3 Bankstown 10/10/2011 18:30        16.1
##  4 Bankstown 10/10/2011 19:30        15.5
##  5 Bankstown 1/1/2012 0:00           15.4
##  6 Bankstown 1/1/2013 0:00           21  
##  7 Bankstown 1/1/2014 0:00           20.4
##  8 Bankstown 1/1/2015 0:00           20.9
##  9 Bankstown 1/1/2016 0:00           16.9
## 10 Bankstown 1/1/2017 0:00           22.6
## 11 Bankstown 1/1/2018 0:00           22.4
## 12 Bankstown 1/1/2019 0:00           22.3
## 13 Bankstown 1/1/2020 0:00           19.4

3 Exploratory Data Analysis

3.1 Univariate Analysis

3.1.1 Summary Statistic of Electricity Demand in NSW

# Summary statistics for raw demand; DATETIME is still character at this point
summary(dfDemand)
##    DATETIME          TOTALDEMAND      REGIONID        
##  Length:196513      Min.   : 5075   Length:196513     
##  Class :character   1st Qu.: 7150   Class :character  
##  Mode  :character   Median : 8053   Mode  :character  
##                     Mean   : 8113                     
##                     3rd Qu.: 8959                     
##                     Max.   :14580

3.1.2 Define Function for DATETIME data

3.1.3 Define function for DATETIME to datetime object

# Parse a "day/month/year hour:minute" character column into POSIXct.
#
# df          data frame containing the datetime column.
# dateTimeCol name of the column to convert (default "DATETIME").
# timeZone    time zone applied to the parsed values.
#             NOTE(review): defaults to "Australia/Brisbane" (no DST) even
#             though the data are for NSW -- presumably chosen to avoid DST
#             gaps/overlaps in half-hourly data; confirm this is intended.
#
# Returns the data frame with the column replaced by POSIXct values.
convertDateTime <- function(df, dateTimeCol = "DATETIME", timeZone = "Australia/Brisbane") {
  # Fail fast when the requested column is missing
  if (!dateTimeCol %in% names(df)) {
    stop(paste(dateTimeCol, "column not found in the dataframe."))
  }

  # lubridate::dmy_hm handles the "1/1/2010 0:00" format used in these files
  df[[dateTimeCol]] <- dmy_hm(df[[dateTimeCol]], tz = timeZone)
  df
}

3.1.4 Define function for Add Time Attributes from Datetime

# Derive calendar/time-of-day attributes from a POSIXct datetime column.
#
# df          data frame containing the datetime column.
# dateTimeCol name of the POSIXct column to derive from (default "DATETIME").
#
# Adds: yearValue, monthValue, dayValue, hourValue, minuteValue,
# timeOfDay (fractional hour), weekOfMonth (factor with levels 1-6) and
# dayOfWeek (factor Sun..Sat). Stops if the column is absent or not POSIXct.
addTimeAttr <- function(df, dateTimeCol = "DATETIME") {
  # Check if the datetime column exists and is of POSIXct type
  if (!dateTimeCol %in% names(df) || !inherits(df[[dateTimeCol]], "POSIXct")) {
    stop(paste(dateTimeCol, "column is not properly formatted as datetime in the dataframe."))
  }
  # Add derived time attributes
  df <- df %>%
    mutate(
      yearValue = year(.data[[dateTimeCol]]),
      monthValue = month(.data[[dateTimeCol]]),
      dayValue = day(.data[[dateTimeCol]]),
      hourValue = hour(.data[[dateTimeCol]]),
      minuteValue = minute(.data[[dateTimeCol]]),
      # Fractional hour of day, e.g. 13.5 for 13:30
      timeOfDay = hour(.data[[dateTimeCol]]) + minute(.data[[dateTimeCol]]) / 60,
      # Week within the month: week-of-year offset from the month's first week
      weekOfMonth = factor(
        week(.data[[dateTimeCol]]) - week(floor_date(.data[[dateTimeCol]], "month")) + 1,
        levels = 1:6  # Levels for weeks in a month
      ),
      # NOTE(review): wday(label = TRUE) labels are locale-dependent; the
      # explicit levels below assume an English locale ("Sun".."Sat") -- confirm.
      dayOfWeek = factor(
        wday(.data[[dateTimeCol]], label = TRUE),
        levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
      )
    )
  
  return(df)
}

3.1.4.1 Convert DATETIME into Timestamp & Check the Result

# Parse dfDemand$DATETIME ("d/m/yyyy h:mm" strings) into POSIXct values
dfDemand <- convertDateTime(dfDemand)
head(dfDemand) 
## # A tibble: 6 × 3
##   DATETIME            TOTALDEMAND REGIONID
##   <dttm>                    <dbl> <chr>   
## 1 2010-01-01 00:00:00       8038  NSW1    
## 2 2010-01-01 00:30:00       7809. NSW1    
## 3 2010-01-01 01:00:00       7484. NSW1    
## 4 2010-01-01 01:30:00       7117. NSW1    
## 5 2010-01-01 02:00:00       6812. NSW1    
## 6 2010-01-01 02:30:00       6544. NSW1

3.1.5 Add datetime attributes for dfDemand

dfDemand <- addTimeAttr(dfDemand)

3.1.6 Add Time attributes for dfDemand

# Derive year/month/day/hour/minute and related attributes from DATETIME
dfDemand <- addTimeAttr(dfDemand)
head(dfDemand) 
## # A tibble: 6 × 11
##   DATETIME            TOTALDEMAND REGIONID yearValue monthValue dayValue
##   <dttm>                    <dbl> <chr>        <dbl>      <dbl>    <int>
## 1 2010-01-01 00:00:00       8038  NSW1          2010          1        1
## 2 2010-01-01 00:30:00       7809. NSW1          2010          1        1
## 3 2010-01-01 01:00:00       7484. NSW1          2010          1        1
## 4 2010-01-01 01:30:00       7117. NSW1          2010          1        1
## 5 2010-01-01 02:00:00       6812. NSW1          2010          1        1
## 6 2010-01-01 02:30:00       6544. NSW1          2010          1        1
## # ℹ 5 more variables: hourValue <int>, minuteValue <int>, timeOfDay <dbl>,
## #   weekOfMonth <fct>, dayOfWeek <ord>

3.1.6.1 Remove ‘REGIONID’ column from dfDemand

# Drop REGIONID: the rows shown above are all "NSW1", so the column appears
# constant. NOTE(review): confirm no other region codes exist before relying on this.
dfDemand <- dfDemand %>%
  select(-'REGIONID')
head(dfDemand)
## # A tibble: 6 × 10
##   DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00       8038       2010          1        1         0
## 2 2010-01-01 00:30:00       7809.      2010          1        1         0
## 3 2010-01-01 01:00:00       7484.      2010          1        1         1
## 4 2010-01-01 01:30:00       7117.      2010          1        1         1
## 5 2010-01-01 02:00:00       6812.      2010          1        1         2
## 6 2010-01-01 02:30:00       6544.      2010          1        1         2
## # ℹ 4 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>

3.1.7 Electricity Demand

3.1.7.1 Yearly Electricity Demand in NSW

# Year boundaries used to draw vertical demarcation lines on the plot
yearBoundaries <- seq(
  from = floor_date(min(dfDemand$DATETIME), "year"),
  to = ceiling_date(max(dfDemand$DATETIME), "year"),
  by = "1 year"
)

# Line plot of half-hourly demand with a dashed red line at each year boundary
ggplot(dfDemand, aes(x = DATETIME, y = TOTALDEMAND)) +
  geom_line(color = "deepskyblue4", linewidth = 1) +
  geom_vline(
    data = data.frame(x = yearBoundaries),
    aes(xintercept = as.numeric(x)),
    linetype = "dashed", color = "firebrick1"
  ) +
  scale_x_datetime(date_breaks = '1 year', date_labels = '%Y') +
  labs(
    x = "Year", y = "Electricity Demand",
    title = "Yearly Electricity Demand in NSW"
  ) +
  theme_light(base_size = 10) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.title.x = element_text(face = "italic"),
    axis.title.y = element_text(face = "italic")
  )

3.1.7.2 Hourly Electricity Demand by Season

# Map month numbers to Southern-Hemisphere seasons, and make the hour a
# factor with levels 0..23 so lattice keeps the hours in order.
dfDemandSeason <- dfDemand %>%
  mutate(
    Season = case_when(
      monthValue %in% c(9, 10, 11) ~ "Spring",
      monthValue %in% c(12, 1, 2) ~ "Summer",
      monthValue %in% c(3, 4, 5) ~ "Autumn",
      monthValue %in% c(6, 7, 8) ~ "Winter",
      TRUE ~ NA_character_
    ),
    hourValue = factor(hourValue, levels = 0:23)
  )

# Box-and-whisker panels of demand by hour of day, one panel per season
bwplot(TOTALDEMAND ~ hourValue | Season, data = dfDemandSeason,
       layout = c(2, 2),
       xlab = "Hour of the Day", ylab = "Total Demand",
       main = "Hourly Electricity Demand by Season",
       scales = list(
         x = list(cex = 0.5),
         y = list(rot = 90, cex = 0.7)
       ))

3.1.8 Summary Statistics of Air Temperature

# Summary statistics for raw temperature; DATETIME is still character here
summary(dfTemp)
##    LOCATION           DATETIME          TEMPERATURE   
##  Length:220326      Length:220326      Min.   :-1.30  
##  Class :character   Class :character   1st Qu.:13.40  
##  Mode  :character   Mode  :character   Median :17.70  
##                                        Mean   :17.42  
##                                        3rd Qu.:21.30  
##                                        Max.   :44.70
# Remove 'LOCATION' column from dfTemp
# NOTE(review): rows shown elsewhere are all "Bankstown"; the column is
# assumed constant -- confirm before dropping if other stations may appear.
dfTemp <- dfTemp %>% select(-'LOCATION')
head(dfTemp)
## # A tibble: 6 × 2
##   DATETIME      TEMPERATURE
##   <chr>               <dbl>
## 1 1/1/2010 0:00        23.1
## 2 1/1/2010 0:01        23.1
## 3 1/1/2010 0:30        22.9
## 4 1/1/2010 0:50        22.7
## 5 1/1/2010 1:00        22.6
## 6 1/1/2010 1:30        22.5

3.1.9 Convert DateTime Value for Temperature

# Parse dfTemp$DATETIME strings into POSIXct values
dfTemp <- convertDateTime(dfTemp)
head(dfTemp)
## # A tibble: 6 × 2
##   DATETIME            TEMPERATURE
##   <dttm>                    <dbl>
## 1 2010-01-01 00:00:00        23.1
## 2 2010-01-01 00:01:00        23.1
## 3 2010-01-01 00:30:00        22.9
## 4 2010-01-01 00:50:00        22.7
## 5 2010-01-01 01:00:00        22.6
## 6 2010-01-01 01:30:00        22.5
# Derive the same time attributes used for dfDemand
dfTemp <- addTimeAttr((dfTemp))
head(dfTemp)
## # A tibble: 6 × 10
##   DATETIME            TEMPERATURE yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00        23.1      2010          1        1         0
## 2 2010-01-01 00:01:00        23.1      2010          1        1         0
## 3 2010-01-01 00:30:00        22.9      2010          1        1         0
## 4 2010-01-01 00:50:00        22.7      2010          1        1         0
## 5 2010-01-01 01:00:00        22.6      2010          1        1         1
## 6 2010-01-01 01:30:00        22.5      2010          1        1         1
## # ℹ 4 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>
# Summary after conversion; note minuteValue takes values other than 0/30,
# which motivates the filtering in the next section
summary(dfTemp)
##     DATETIME                       TEMPERATURE      yearValue   
##  Min.   :2010-01-01 00:00:00.00   Min.   :-1.30   Min.   :2010  
##  1st Qu.:2012-11-02 23:07:30.00   1st Qu.:13.40   1st Qu.:2012  
##  Median :2015-08-16 13:45:00.00   Median :17.70   Median :2015  
##  Mean   :2015-08-16 07:59:12.96   Mean   :17.42   Mean   :2015  
##  3rd Qu.:2018-06-03 14:22:30.00   3rd Qu.:21.30   3rd Qu.:2018  
##  Max.   :2021-03-18 00:00:00.00   Max.   :44.70   Max.   :2021  
##                                                                 
##    monthValue        dayValue       hourValue      minuteValue   
##  Min.   : 1.000   Min.   : 1.00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 3.000   1st Qu.: 8.00   1st Qu.: 5.00   1st Qu.: 0.00  
##  Median : 6.000   Median :16.00   Median :11.00   Median :29.00  
##  Mean   : 6.423   Mean   :15.66   Mean   :11.41   Mean   :16.51  
##  3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:17.00   3rd Qu.:30.00  
##  Max.   :12.000   Max.   :31.00   Max.   :23.00   Max.   :59.00  
##                                                                  
##    timeOfDay     weekOfMonth dayOfWeek  
##  Min.   : 0.00   1:28907     Sun:31373  
##  1st Qu.: 5.50   2:51106     Mon:31301  
##  Median :11.50   3:50611     Tue:31796  
##  Mean   :11.69   4:50803     Wed:31736  
##  3rd Qu.:17.50   5:36736     Thu:31356  
##  Max.   :23.98   6: 2163     Fri:31357  
##                              Sat:31407

Since the temperature data updates occur at irregular minute intervals and the electricity demand data are recorded every 30 minutes, temperature rows with a minuteValue other than 0 or 30 are removed.

3.1.10 Remove rows where minute are not equal to 0 or 30

# Remove rows from dfTemp where minuteValue is not 0 or 30 and store the result in dfTemp
# (aligns temperature readings with the half-hourly demand grid)
dfTemp <- dfTemp %>%
  filter(minuteValue == 0 | minuteValue == 30)

# Print a summary to verify the rows have been filtered correctly
summary(dfTemp)
##     DATETIME                       TEMPERATURE      yearValue   
##  Min.   :2010-01-01 00:00:00.00   Min.   :-1.30   Min.   :2010  
##  1st Qu.:2012-10-24 01:15:00.00   1st Qu.:13.50   1st Qu.:2012  
##  Median :2015-08-10 17:30:00.00   Median :17.90   Median :2015  
##  Mean   :2015-08-12 09:19:17.21   Mean   :17.53   Mean   :2015  
##  3rd Qu.:2018-06-01 10:15:00.00   3rd Qu.:21.50   3rd Qu.:2018  
##  Max.   :2021-03-18 00:00:00.00   Max.   :44.70   Max.   :2021  
##                                                                 
##    monthValue        dayValue       hourValue     minuteValue   timeOfDay    
##  Min.   : 1.000   Min.   : 1.00   Min.   : 0.0   Min.   : 0   Min.   : 0.00  
##  1st Qu.: 3.000   1st Qu.: 8.00   1st Qu.: 6.0   1st Qu.: 0   1st Qu.: 6.00  
##  Median : 6.000   Median :16.00   Median :12.0   Median : 0   Median :12.00  
##  Mean   : 6.437   Mean   :15.69   Mean   :11.5   Mean   :15   Mean   :11.75  
##  3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:18.0   3rd Qu.:30   3rd Qu.:18.00  
##  Max.   :12.000   Max.   :31.00   Max.   :23.0   Max.   :30   Max.   :23.50  
##                                                                              
##  weekOfMonth dayOfWeek  
##  1:25543     Sun:28003  
##  2:45289     Mon:27959  
##  3:45027     Tue:28005  
##  4:44897     Wed:28044  
##  5:33225     Thu:27956  
##  6: 1966     Fri:27992  
##              Sat:27988
# Ensure DATETIME is POSIXct before plotting.
# NOTE(review): DATETIME was already converted by convertDateTime() above,
# so this coercion is a defensive no-op.
dfTemp$DATETIME <- as.POSIXct(dfTemp$DATETIME)

# Create a sequence of years from min to max year in 'DATETIME'
yearBreaks <- seq(from = floor_date(min(dfTemp$DATETIME), "year"),
                  to = ceiling_date(max(dfTemp$DATETIME), "year"),
                  by = "1 year")

# Numeric positions where the x-axis year labels are drawn
labelPositions <- as.numeric(yearBreaks)

# Lattice line plot of temperature with dashed red lines at year boundaries
temperaturePlot <- xyplot(TEMPERATURE ~ DATETIME, data = dfTemp,
                          panel = function(x, y, ...) {
                            # Plot data as a line
                            panel.xyplot(x, y, type = "l", ...)
                            # Add vertical lines for each year
                            panel.abline(v = yearBreaks, col = "red", lty = 2)
                          },
                          scales = list(x = list(at = labelPositions, labels = format(yearBreaks, "%Y"),
                                                 rot = 0)), # Labels are horizontal
                          xlab = "Year",
                          ylab = "Temperature",
                          main = "Yearly Temperature Variation in NSW")  # fixed typo "Vairation"

# Print the plot
print(temperaturePlot)
Temperature Over Time.

Figure 3.1: Temperature Over Time.

The line graph showcasing temperature variations in New South Wales from 2010 to 2021 vividly illustrates a pronounced seasonal pattern, with temperature fluctuations that consistently repeat each year throughout the observed period. Each year within the dataset, temperatures in NSW ascend during the warmer months and descend during the cooler periods, showcasing a clear and predictable cycle of seasonal temperature changes. This regularity is marked by similar patterns emerging year after year, underscoring the stable and predictable nature of seasonal influences on the climate of NSW.

3.1.11 Forecast Demand Data

When dealing with multiple forecast data points for each timestamp, taking the median value is an effective method to stabilize and enhance the reliability of the data. Each timestamp may have varying forecasts due to different modeling assumptions. By calculating the median, these discrepancies are smoothed out, providing a more consistent and robust estimate of demand. This approach helps in reducing noise and ensures that the predictions are less influenced by outliers or extreme variations, making the forecast data more dependable for decision-making and analysis.

# Group the data by 'DATETIME', then calculate the median of 'FORECASTDEMAND' for each group
# Collapsing the multiple forecast records per timestamp to their median
# yields one robust forecast value per half-hour.
dfForecast <- dfForecastDemand %>%
  group_by(DATETIME) %>%                    # Group data by the 'DATETIME' column
  summarise(MedianForecastDemand = median(FORECASTDEMAND, na.rm = TRUE))  # Calculate median, removing NA values

# View the new dataframe
head(dfForecast)
## # A tibble: 6 × 2
##   DATETIME            MedianForecastDemand
##   <dttm>                             <dbl>
## 1 2010-01-01 00:00:00                7821.
## 2 2010-01-01 00:30:00                7709.
## 3 2010-01-01 01:00:00                7483.
## 4 2010-01-01 01:30:00                7113.
## 5 2010-01-01 02:00:00                6776.
## 6 2010-01-01 02:30:00                6523.

3.1.12 Summary Statistics of Forecast Demand in NSW

# Summary statistics of the median forecast demand series
summary(dfForecast)
##     DATETIME                   MedianForecastDemand
##  Min.   :2010-01-01 00:00:00   Min.   : 4835       
##  1st Qu.:2012-10-20 12:00:00   1st Qu.: 7123       
##  Median :2015-08-10 00:00:00   Median : 8059       
##  Mean   :2015-08-10 00:00:00   Mean   : 8105       
##  3rd Qu.:2018-05-29 12:00:00   3rd Qu.: 8967       
##  Max.   :2021-03-18 00:00:00   Max.   :14583

The ‘dfForecast’ dataset provides a summary of forecasted demand in NSW from January 1, 2010, to March 18, 2021. The date distribution is centered around August 10, 2015, indicating a balanced spread over the analyzed period. Forecasted demand ranges from 4,835 to 14,583, with a median of 8,059 and a mean very close at 8,105, suggesting a relatively symmetric distribution around the central values. The quartile figures, with the first quartile at 7,123 and the third at 8,967, highlight the variability in forecasted demands. This data is crucial for understanding trends and making informed decisions in resource management and planning based on anticipated demand.

3.2 Bivariate Analysis

# Re-inspect both frames before joining holiday information
head(dfDemand)
## # A tibble: 6 × 10
##   DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00       8038       2010          1        1         0
## 2 2010-01-01 00:30:00       7809.      2010          1        1         0
## 3 2010-01-01 01:00:00       7484.      2010          1        1         1
## 4 2010-01-01 01:30:00       7117.      2010          1        1         1
## 5 2010-01-01 02:00:00       6812.      2010          1        1         2
## 6 2010-01-01 02:30:00       6544.      2010          1        1         2
## # ℹ 4 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>
head(dfHoliday)
## # A tibble: 6 × 5
##   Date       State Weekday  Day_In_Lieu Part_Day_From
##   <date>     <chr> <chr>          <dbl>         <dbl>
## 1 2009-01-01 NSW   Thursday           0             0
## 2 2009-01-26 NSW   Monday             0             0
## 3 2009-04-10 NSW   Friday             0             0
## 4 2009-04-11 NSW   Saturday           0             0
## 5 2009-04-12 NSW   Sunday             0             0
## 6 2009-04-13 NSW   Monday             0             0

3.2.1 Effect of Holidays on Electricity Demand in NSW

# Flag each half-hour as Holiday/Non-Holiday using the NSW public-holiday
# dates, and keep a plain Date column for later date-level grouping.
dfDemand <- dfDemand %>%
  mutate(
    IsHoliday = case_when(
      date(DATETIME) %in% dfHoliday$Date ~ 'Holiday',
      TRUE ~ 'Non-Holiday'
    ),
    Date = as.Date(DATETIME)
  )

# Violin plot contrasting the demand distribution on holidays vs non-holidays
ggplot(dfDemand, aes(x = IsHoliday, y = TOTALDEMAND, fill = IsHoliday)) +
  geom_violin(trim = FALSE) +
  labs(title = "Electricity Demand Comparison on Holidays vs Non-Holidays",
       x = "", y = "Total Demand (MW)") +
  scale_fill_manual(values = c("Holiday" = "red", "Non-Holiday" = "blue")) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

The violin plot illustrates the differences in electricity demand between holidays and non-holidays. It shows that the median demand on holidays tends to be lower compared to non-holidays. Furthermore, this plot provides insight into the overall distribution and highlights several outliers that fall outside the common range for both holiday and non-holiday periods.

# Verify the new IsHoliday and Date columns were added
head(dfDemand)
## # A tibble: 6 × 12
##   DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00       8038       2010          1        1         0
## 2 2010-01-01 00:30:00       7809.      2010          1        1         0
## 3 2010-01-01 01:00:00       7484.      2010          1        1         1
## 4 2010-01-01 01:30:00       7117.      2010          1        1         1
## 5 2010-01-01 02:00:00       6812.      2010          1        1         2
## 6 2010-01-01 02:30:00       6544.      2010          1        1         2
## # ℹ 6 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>, IsHoliday <chr>, Date <date>

3.2.2 Correlation of Electricity Demand vs Temperature

# Merge the dfDemand and dfTemp based on 'DATETIME'
# NOTE(review): both frames carry the same derived time columns, so the join
# presumably duplicates them with .x/.y suffixes -- only TOTALDEMAND and
# TEMPERATURE are used below, so this is harmless here.
dfDT <- inner_join(dfDemand, dfTemp, by = "DATETIME")

# Calculate Pearson's correlation coefficient
corrResult <- cor(dfDT$TOTALDEMAND, dfDT$TEMPERATURE, method = "pearson")

# Print the correlation coefficient
print(corrResult)
## [1] 0.1490452
# Perform a significance test (H0: true correlation equals zero)
corrTestResult <- cor.test(dfDT$TOTALDEMAND, dfDT$TEMPERATURE, method = "pearson")
print(corrTestResult)
## 
##  Pearson's product-moment correlation
## 
## data:  dfDT$TOTALDEMAND and dfDT$TEMPERATURE
## t = 66.721, df = 195945, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1447130 0.1533717
## sample estimates:
##       cor 
## 0.1490452

3.2.2.1 Pearson’s Correlation

Using Pearson’s correlation to assess the relationship between electricity demand and temperature in NSW is well-suited due to both variables being continuous and the assumption that they share a linear relationship.

The Pearson correlation analysis between electricity demand (‘TOTALDEMAND’) and temperature (‘TEMPERATURE’) using the merged dataset ‘dfDT’ produced a correlation coefficient (‘cor’) of 0.149. This positive correlation indicates that as the temperature increases, there is a slight upward trend in electricity demand. The results of the Pearson’s test were highly statistically significant, as indicated by an extremely small p-value (less than 2.2e-16), strongly rejecting the null hypothesis that there is no correlation between these two variables.

The confidence interval for this correlation, ranging from 0.1447 to 0.1534, is narrow, suggesting a stable estimate of the correlation coefficient. This interval underscores that, while the correlation is relatively modest, it is consistently different from zero across different samples from the population.

In conclusion, the analysis confirms a statistically significant positive relationship between temperature and electricity demand. This suggests that higher temperatures may lead to increased electricity usage, possibly due to greater use of cooling systems or other temperature-dependent activities.

3.2.3 Comparison of Forecast Electricity Demand vs Actual Electricity Demand

# Inner join dfDemand & dfForecast on the half-hourly timestamp
dfDF <- inner_join(dfDemand, dfForecast, by = "DATETIME")
head(dfDF)
## # A tibble: 6 × 13
##   DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 10:00:00       8067.      2010          1        1        10
## 2 2010-01-01 10:30:00       8123.      2010          1        1        10
## 3 2010-01-01 11:00:00       8196.      2010          1        1        11
## 4 2010-01-01 11:30:00       8221.      2010          1        1        11
## 5 2010-01-01 12:00:00       8278.      2010          1        1        12
## 6 2010-01-01 12:30:00       8362.      2010          1        1        12
## # ℹ 7 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>, IsHoliday <chr>, Date <date>, MedianForecastDemand <dbl>
# Select the data from 2015 to 2018 from dfDF
dfDF2015_2018 <- dfDF %>%
  filter(yearValue >= 2015 & yearValue <= 2018)
# Plot loess-smoothed forecast vs actual demand using dfDF2015_2018
plot <- ggplot(dfDF2015_2018, aes(x = DATETIME)) +
  geom_smooth(aes(y = MedianForecastDemand, colour = "Median Forecast Demand"), 
              method = "loess", se = FALSE, linewidth = 1) +
  geom_smooth(aes(y = TOTALDEMAND, colour = "Total Demand"), 
              method = "loess", se = FALSE, linewidth = 1) +
  labs(title = "Smoothed Comparison of Median Forecast Demand and Actual Demand from 2015 to 2018",
       x = "Year",
       y = "Demand") +
  scale_colour_manual(values = c("Median Forecast Demand" = "blue", "Total Demand" = "red")) +
  theme_minimal()

# Print the plot
print(plot)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Calculate MAE (average absolute deviation between actual and forecast)
maeValue <- mae(dfDF$TOTALDEMAND, dfDF$MedianForecastDemand)
print(paste("Mean Absolute Error (MAE):", maeValue))
## [1] "Mean Absolute Error (MAE): 1412.68754332215"
# Calculate RMSE (penalizes large deviations more heavily than MAE)
rmseValue <- rmse(dfDF$TOTALDEMAND, dfDF$MedianForecastDemand)
print(paste("Root Mean Squared Error (RMSE):", rmseValue))
## [1] "Root Mean Squared Error (RMSE): 1774.70285666052"

MAE = 1412.6: This value suggests that on average, the predictions of the forecasting model deviate from the actual demand by about 1412.6 units.

RMSE = 1774.7: The value of 1774.7 indicates that while many of the predictions may be reasonably accurate, there are notable instances where the forecast deviates significantly from the actual demand. This could be indicative of the model’s sensitivity to data anomalies or its failure to capture extreme fluctuations due to external factors or rare events.

3.2.3.1 Determining Optimal Hour of Delay

# Root-mean-squared error between two numeric vectors; NA pairs are ignored.
calculateRMSE <- function(actual, predicted) {
  squaredDiff <- (predicted - actual)^2
  sqrt(mean(squaredDiff, na.rm = TRUE))
}
# Define a range of delays to test, from -24 to +24 hours
delays <- seq(-24, 24, by = 1)  

# Initialize a data frame to store the results
results <- data.frame(Delay = integer(), RMSE = numeric())

# For each candidate delay, shift the forecast timestamps, align with actual
# demand, and record the RMSE.
# NOTE(review): growing `results` with rbind() inside the loop is O(n^2);
# preallocating (or lapply + bind_rows) would be preferable.
for (delay in delays) {
  # Shift the forecast data by 'delay' hours
  dfForecastShifted <- dfForecast %>%
    mutate(DATETIME = DATETIME + hours(delay))
  
  # Merge with actual demand data
  dfComparison <- inner_join(dfForecastShifted, dfDemand, by = "DATETIME")
  
  # Calculate RMSE for this delay
  rmse <- calculateRMSE(dfComparison$TOTALDEMAND, dfComparison$MedianForecastDemand)
  
  # Store the results
  results <- rbind(results, data.frame(Delay = delay, RMSE = rmse))
}

# Identify the delay with the minimum RMSE
# NOTE(review): after the loop, dfComparison holds the LAST delay tested
# (+24 h), not the optimal one (-10 h); the error analysis in the next
# section reuses that leftover dfComparison -- confirm this is intended.
optimal_delay <- results[which.min(results$RMSE), ]
print(optimal_delay)
##    Delay     RMSE
## 15   -10 230.4831

The optimal delay is -10 hours: shifting the forecast series back by 10 hours yields the minimum RMSE (approximately 230.5).

3.2.3.2 Time Analysis of Error

dfError <- dfComparison %>%
  mutate(
    AbsoluteError = abs(TOTALDEMAND - MedianForecastDemand),  # Mean Absolute Error component
    SquaredError = (TOTALDEMAND - MedianForecastDemand)^2  # Mean Squared Error component
  )
# Plotting Absolute Errors over Time
ggplot(dfError, aes(x = DATETIME, y = AbsoluteError)) +
  geom_line() +
  labs(title = "Absolute Errors Over Time", x = "Year", y = "Absolute Error") +
  theme_minimal()

# Plotting Squared Errors over Time
ggplot(dfError, aes(x = DATETIME, y = SquaredError)) +
  geom_line() +
  labs(title = "Squared Errors Over Time", x = "Datetime", y = "Squared Error") +
  theme_minimal()

dfErrorOutlier <- dfError %>%
  mutate(
    IQR = IQR(AbsoluteError, na.rm = TRUE),  # Calculate IQR
    UpperBound = quantile(AbsoluteError, 0.75, na.rm = TRUE) + 1.5 * IQR,
    LowerBound = quantile(AbsoluteError, 0.25, na.rm = TRUE) - 1.5 * IQR,
    Outlier = AbsoluteError > UpperBound | AbsoluteError < LowerBound  # Identify outliers
  )
# Create groups of consecutive outliers
dfErrorOutlier <- dfErrorOutlier %>%
  mutate(
    SquaredError = abs(TOTALDEMAND - MedianForecastDemand),
    Outlier = SquaredError > (quantile(SquaredError, 0.75, na.rm = TRUE) + 1.5 * IQR(SquaredError, na.rm = TRUE))
  ) %>%
  arrange(DATETIME) %>%
  mutate(
    GroupChange = Outlier != lag(Outlier, default = first(Outlier)),
    Group = cumsum(GroupChange)
  )

# Getting intervals for outliers
outlierInterval <- dfErrorOutlier %>%
  filter(Outlier) %>%
  group_by(Group) %>%
  summarise(
    Start = min(DATETIME),
    End = max(DATETIME)
  ) %>%
  ungroup()

# Plotting Squared Errors over Time with Outlier intervals highlighted
plot <- ggplot() +
  geom_rect(data = outlierInterval, aes(xmin = Start, xmax = End, ymin = -Inf, ymax = Inf), fill = "red", alpha = 0.2) +
  geom_line(data = dfErrorOutlier, aes(x = DATETIME, y = SquaredError), color = "gray", alpha = 0.5) +
  geom_point(data = dfErrorOutlier, aes(x = DATETIME, y = SquaredError, color = Outlier), linewidth = 1, alpha = 0.8) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
  labs(title = "Squared Errors Over Time with Outliers Highlighted") +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")  # Center and bold the title
  ) +
  geom_vline(data = data.frame(x = seq(from = floor_date(min(dfErrorOutlier$DATETIME), "year"),
                                       to = ceiling_date(max(dfErrorOutlier$DATETIME), "year"),
                                       by = "1 year")),
             aes(xintercept = as.numeric(x)), linetype = "dashed", color = "blue") +
  scale_x_datetime(breaks = seq(from = floor_date(min(dfErrorOutlier$DATETIME), "year"),
                                to = ceiling_date(max(dfErrorOutlier$DATETIME), "year"),
                                by = "1 year"),
                   labels = scales::date_format("%Y"))  # Formatting labels to show only the year part
## Warning in geom_point(data = dfErrorOutlier, aes(x = DATETIME, y =
## SquaredError, : Ignoring unknown parameters: `linewidth`
# Render the squared-error plot
print(plot)

3.2.4 STL Decomposition

# Build a half-hourly tsibble of total demand (DATETIME as the index)
dfDemandTS <- dfDemand %>%
  select(DATETIME, TOTALDEMAND) %>%
  as_tsibble(index = DATETIME)

# Robust STL decomposition with a periodic seasonal window
decomposed <- dfDemandTS %>%
  model(STL = STL(TOTALDEMAND ~ season(window = "periodic"), robust = TRUE)) %>%
  components()

# Plot trend / seasonal / remainder components
autoplot(decomposed)

3.2.5 Wavelet Decomposition

# Wavelet analysis of total demand: decompose with a 4-level Haar discrete
# wavelet transform (DWT), invert it, and compare against the original series.
demandSeries <- as.numeric(dfDemand$TOTALDEMAND)

# 4-level DWT with the Haar filter and periodic boundary handling
dwtResult <- dwt(demandSeries, filter = "haar", n.levels = 4, boundary = "periodic")

# Invert the transform to reconstruct the series from its coefficients
reconstructed <- idwt(dwtResult)

# Assemble original and reconstructed values side by side for plotting
plotData <- data.frame(
  Index = seq_along(demandSeries),
  Original = demandSeries,
  Reconstructed = reconstructed
)

# Overlay the two series: blue = original, red = reconstruction
ggplot(plotData) +
  geom_point(aes(x = Index, y = Original), color = "blue", alpha = 0.5) +
  geom_point(aes(x = Index, y = Reconstructed), color = "red", alpha = 0.5) +
  labs(title = "Original vs Reconstructed Series",
       x = "Index",
       y = "Value") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

3.3 Correlation Heatmap

3.3.1 Electricity Demand

# Preview the demand data before building the correlation heatmap
head(dfDemand)
## # A tibble: 6 × 12
##   DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00       8038       2010          1        1         0
## 2 2010-01-01 00:30:00       7809.      2010          1        1         0
## 3 2010-01-01 01:00:00       7484.      2010          1        1         1
## 4 2010-01-01 01:30:00       7117.      2010          1        1         1
## 5 2010-01-01 02:00:00       6812.      2010          1        1         2
## 6 2010-01-01 02:30:00       6544.      2010          1        1         2
## # ℹ 6 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>, IsHoliday <chr>, Date <date>

3.3.2 Convert “IsHoliday” from chr type into binary type, 1 and 0

# Encode IsHoliday as a numeric indicator: 1 for "Holiday", 0 otherwise
dfDemand <- dfDemand %>%
  mutate(IsHoliday = as.numeric(IsHoliday == "Holiday"))
head(dfDemand)
## # A tibble: 6 × 12
##   DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00       8038       2010          1        1         0
## 2 2010-01-01 00:30:00       7809.      2010          1        1         0
## 3 2010-01-01 01:00:00       7484.      2010          1        1         1
## 4 2010-01-01 01:30:00       7117.      2010          1        1         1
## 5 2010-01-01 02:00:00       6812.      2010          1        1         2
## 6 2010-01-01 02:30:00       6544.      2010          1        1         2
## # ℹ 6 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>, IsHoliday <dbl>, Date <date>
# Drop identifier/time columns that would distort the correlation matrix and
# coerce everything that remains (including factor/ordinal columns) to numeric
dfCorHM <- dfDemand %>% 
  select(-DATETIME, -Date, -timeOfDay, -minuteValue) %>% 
  mutate(across(everything(), as.numeric))

# Pairwise correlation matrix of the remaining variables
corMatrix <- cor(dfCorHM)

# Heatmap with hierarchical-cluster ordering and printed coefficients
corrplot(corMatrix, method = "color", type = "full", order = "hclust",
         tl.col = "black", tl.srt = 45, 
         addCoef.col = "black") 

# Extract each variable's correlation with TOTALDEMAND, ranked by magnitude
totalDemandCor <- corMatrix["TOTALDEMAND", ]
totalDemandCor <- totalDemandCor[order(-abs(totalDemandCor))]  # Sorting by absolute value

# Drop the self-correlation by name.
# FIX: the original filtered with `totalDemandCor != 1`, which relies on exact
# floating-point equality and would also drop any other variable that happened
# to be perfectly correlated with demand. Removing by name is unambiguous.
totalDemandCor <- totalDemandCor[names(totalDemandCor) != "TOTALDEMAND"]

# Viewing the ranked correlations
print(totalDemandCor)
##    hourValue    yearValue   monthValue    IsHoliday    dayOfWeek  weekOfMonth 
##  0.476948947 -0.209345123 -0.122574678 -0.121197507  0.043732681  0.005078356 
##     dayValue 
##  0.004951010

3.4 Check for Stationarity by Augmented Dickey-Fuller Test

# Extract the demand values for modeling.
# FIX: the original used `dfDemandTS %>% select(TOTALDEMAND) %>% unlist()`.
# select() on a tsibble silently retains the DATETIME index column, so unlist()
# produced the demand values concatenated with datetime values (~393k elements;
# the recorded ADF lag order of 73 implies n of about 393k — double the
# 196,513 observations). Pull the column directly instead.
# NOTE: the printed ADF results below predate this fix and should be rerun.
totalDemand <- as.numeric(dfDemandTS$TOTALDEMAND)

tsData <- ts(totalDemand, frequency = 336) # 48 half-hours per day * 7 days = 336 half-hours per week
adfTest <- adf.test(tsData, alternative = "stationary")
print(adfTest)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  tsData
## Dickey-Fuller = -2.1127, Lag order = 73, p-value = 0.5309
## alternative hypothesis: stationary

3.4.1 Apply Regular First Differencing

# First difference to remove the stochastic trend
tsDataDiff <- diff(tsData, differences = 1)

# Re-run the ADF test on the differenced series.
# NOTE(review): `tsData` was built upstream via a tsibble select/unlist that
# also captured the DATETIME index, so the recorded results below should be
# regenerated once that extraction is corrected — confirm.
adfTestDiff <- adf.test(tsDataDiff, alternative = "stationary")
## Warning in adf.test(tsDataDiff, alternative = "stationary"): p-value smaller
## than printed p-value
print(adfTestDiff)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  tsDataDiff
## Dickey-Fuller = -72.871, Lag order = 73, p-value = 0.01
## alternative hypothesis: stationary

The results of the Augmented Dickey-Fuller (ADF) test on the differenced data (tsDataDiff) indicate that, after first differencing, the time series has become stationary. The test statistic is strongly negative (-72.871) and the reported p-value is 0.01, well below the conventional 0.05 threshold, so the null hypothesis of non-stationarity can be rejected. The warning that the p-value is smaller than the printed value indicates the true p-value is even smaller, reinforcing the conclusion that the differenced series is stationary.

3.4.2 Determine the orders of the autoregressive (AR) and moving average (MA) components for the ARIMA model.

3.4.3 ACF Plot for Daily Seasonality

# ACF of the daily (lag-48) differenced demand over the default lag range
dailyDiffed <- dfDemandTS %>%
  mutate(demandDiff = difference(TOTALDEMAND, 48))
dailyDiffed %>%
  ACF(demandDiff) %>%
  autoplot()

The lag at which the ACF first drops to 0 can be read from the plot above. [TODO: record the observed lag here.]

3.4.4 ACF Plot of Daily and Weekly Seasonality

# ACF of the daily-differenced series out to one week (336 half-hours)
weeklyWindowDiffed <- dfDemandTS %>%
  mutate(demandDiff = difference(TOTALDEMAND, 48))
weeklyWindowDiffed %>%
  ACF(demandDiff, lag_max = 336) %>%
  autoplot()

3.4.5 PACF Plot for Daily Seasonality

# PACF of the daily-differenced demand over the default lag range
pacfDaily <- dfDemandTS %>%
  mutate(demandDiff = difference(TOTALDEMAND, 48))
pacfDaily %>%
  PACF(demandDiff) %>%
  autoplot()

### PACF Plot for Daily and Weekly Seasonality

# PACF of the daily-differenced series out to one week (336 half-hours)
pacfWeekly <- dfDemandTS %>%
  mutate(demandDiff = difference(TOTALDEMAND, 48))
pacfWeekly %>%
  PACF(demandDiff, lag_max = 336) %>%
  autoplot()

### PACF Plot for Daily and Monthly Seasonality

# PACF of the daily-differenced series out to 30 days (1440 half-hours)
pacfMonthly <- dfDemandTS %>%
  mutate(demandDiff = difference(TOTALDEMAND, 48))
pacfMonthly %>%
  PACF(demandDiff, lag_max = 1440) %>%
  autoplot()

# Check for duplicated rows in the demand data
duplicates <- duplicated(dfDemand)

# View the rows that are duplicates (an empty tibble means none)
dfDemand[duplicates, ]
## # A tibble: 0 × 12
## # ℹ 12 variables: DATETIME <dttm>, TOTALDEMAND <dbl>, yearValue <dbl>,
## #   monthValue <dbl>, dayValue <int>, hourValue <int>, minuteValue <int>,
## #   timeOfDay <dbl>, weekOfMonth <fct>, dayOfWeek <ord>, IsHoliday <dbl>,
## #   Date <date>

4 Modelling

4.1 Findings from EDA

  • First differencing makes data stationary
  • Daily, Weekly Seasonality Detected
  • Optimal hours of delay = 10
  • Variables with absolute correlation above 0.1: hourValue, yearValue, monthValue, IsHoliday

4.2 Data Preparation for Modelling

# Re-inspect the demand data before merging with temperature
head(dfDemand)
## # A tibble: 6 × 12
##   DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00       8038       2010          1        1         0
## 2 2010-01-01 00:30:00       7809.      2010          1        1         0
## 3 2010-01-01 01:00:00       7484.      2010          1        1         1
## 4 2010-01-01 01:30:00       7117.      2010          1        1         1
## 5 2010-01-01 02:00:00       6812.      2010          1        1         2
## 6 2010-01-01 02:30:00       6544.      2010          1        1         2
## # ℹ 6 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>, IsHoliday <dbl>, Date <date>
# Inspect the temperature data structure before the merge
head(dfTemp)
## # A tibble: 6 × 10
##   DATETIME            TEMPERATURE yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00        23.1      2010          1        1         0
## 2 2010-01-01 00:30:00        22.9      2010          1        1         0
## 3 2010-01-01 01:00:00        22.6      2010          1        1         1
## 4 2010-01-01 01:30:00        22.5      2010          1        1         1
## 5 2010-01-01 02:00:00        22.5      2010          1        1         2
## 6 2010-01-01 02:30:00        22.4      2010          1        1         2
## # ℹ 4 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>

4.2.0.1 Select DATETIME, TEMPERATURE from dfTemp

# Reduce dfTemp to the join key (DATETIME) and the temperature reading
dfTemp <- dfTemp %>%
  select(DATETIME, TEMPERATURE)
head(dfTemp)
## # A tibble: 6 × 2
##   DATETIME            TEMPERATURE
##   <dttm>                    <dbl>
## 1 2010-01-01 00:00:00        23.1
## 2 2010-01-01 00:30:00        22.9
## 3 2010-01-01 01:00:00        22.6
## 4 2010-01-01 01:30:00        22.5
## 5 2010-01-01 02:00:00        22.5
## 6 2010-01-01 02:30:00        22.4
# Check for duplicated rows in the temperature data
# (the output below shows 13 duplicates, mostly New Year midnight readings)
duplicates <- duplicated(dfTemp)

# View the rows that are duplicates
dfTemp[duplicates, ]
## # A tibble: 13 × 2
##    DATETIME            TEMPERATURE
##    <dttm>                    <dbl>
##  1 2011-01-01 00:00:00        21  
##  2 2011-10-10 10:30:00        18.9
##  3 2011-10-10 18:30:00        16.1
##  4 2011-10-10 19:30:00        15.5
##  5 2012-01-01 00:00:00        15.4
##  6 2013-01-01 00:00:00        21  
##  7 2014-01-01 00:00:00        20.4
##  8 2015-01-01 00:00:00        20.9
##  9 2016-01-01 00:00:00        16.9
## 10 2017-01-01 00:00:00        22.6
## 11 2018-01-01 00:00:00        22.4
## 12 2019-01-01 00:00:00        22.3
## 13 2020-01-01 00:00:00        19.4

4.2.0.2 Merge dfDemand, dfTemp by DATETIME into dfDT

# Left-join temperature onto demand by timestamp; demand rows with no matching
# temperature reading get TEMPERATURE = NA.
# NOTE(review): the duplicate timestamps seen in dfTemp fan out here, which is
# why dfDT shows 13 duplicated rows below — confirm this is handled downstream.
dfDT <- left_join(dfDemand, dfTemp, by = 'DATETIME')
head(dfDT)
## # A tibble: 6 × 13
##   DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##   <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
## 1 2010-01-01 00:00:00       8038       2010          1        1         0
## 2 2010-01-01 00:30:00       7809.      2010          1        1         0
## 3 2010-01-01 01:00:00       7484.      2010          1        1         1
## 4 2010-01-01 01:30:00       7117.      2010          1        1         1
## 5 2010-01-01 02:00:00       6812.      2010          1        1         2
## 6 2010-01-01 02:30:00       6544.      2010          1        1         2
## # ℹ 7 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>, IsHoliday <dbl>, Date <date>, TEMPERATURE <dbl>
# Check for duplicated rows introduced by the join
duplicates <- duplicated(dfDT)

# View the rows that are duplicates
dfDT[duplicates, ]
## # A tibble: 13 × 13
##    DATETIME            TOTALDEMAND yearValue monthValue dayValue hourValue
##    <dttm>                    <dbl>     <dbl>      <dbl>    <int>     <int>
##  1 2011-01-01 00:00:00       8063.      2011          1        1         0
##  2 2011-10-10 10:30:00       9039.      2011         10       10        10
##  3 2011-10-10 18:30:00       9221.      2011         10       10        18
##  4 2011-10-10 19:30:00       9021.      2011         10       10        19
##  5 2012-01-01 00:00:00       7079.      2012          1        1         0
##  6 2013-01-01 00:00:00       7359.      2013          1        1         0
##  7 2014-01-01 00:00:00       7010.      2014          1        1         0
##  8 2015-01-01 00:00:00       7058.      2015          1        1         0
##  9 2016-01-01 00:00:00       7140.      2016          1        1         0
## 10 2017-01-01 00:00:00       7431.      2017          1        1         0
## 11 2018-01-01 00:00:00       7035.      2018          1        1         0
## 12 2019-01-01 00:00:00       7613.      2019          1        1         0
## 13 2020-01-01 00:00:00       7319.      2020          1        1         0
## # ℹ 7 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## #   dayOfWeek <ord>, IsHoliday <dbl>, Date <date>, TEMPERATURE <dbl>

4.2.0.3 Count NA

# countNA() is a helper defined earlier in this document (not shown in this
# excerpt); it reports the total count of NA values in the data frame
countNA(dfDT)
## True
## Total number of NA values: 579

4.2.0.4 Fill NA

# Interpolate the missing temperature readings with zoo::na.spline
# (cubic-spline interpolation over the existing values)
dfDT$TEMPERATURE <- na.spline(dfDT$TEMPERATURE)

4.2.0.5 Select DATETIME, TOTALDEMAND, TEMPERATURE, hourValue, yearValue, monthValue, IsHoliday from dfDT into dfModel

# Keep the target, the exogenous TEMPERATURE, and the predictors selected
# during EDA (|correlation| > 0.1): hourValue, yearValue, monthValue, IsHoliday
dfModel <- dfDT %>% 
  select(DATETIME, TOTALDEMAND, TEMPERATURE, hourValue, yearValue, monthValue, IsHoliday)
head(dfModel)
## # A tibble: 6 × 7
##   DATETIME            TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
##   <dttm>                    <dbl>       <dbl>     <int>     <dbl>      <dbl>
## 1 2010-01-01 00:00:00       8038         23.1         0      2010          1
## 2 2010-01-01 00:30:00       7809.        22.9         0      2010          1
## 3 2010-01-01 01:00:00       7484.        22.6         1      2010          1
## 4 2010-01-01 01:30:00       7117.        22.5         1      2010          1
## 5 2010-01-01 02:00:00       6812.        22.5         2      2010          1
## 6 2010-01-01 02:30:00       6544.        22.4         2      2010          1
## # ℹ 1 more variable: IsHoliday <dbl>

4.2.0.6 Check NA values

# Confirm the spline interpolation left no missing values
countNA(dfModel)
## False
## No NA values present.

4.2.0.7 Check Duplicate Values

# Check for the duplicated rows carried over from the join
duplicates <- duplicated(dfModel)

# View the rows that are duplicates (13 expected, per the output below)
dfModel[duplicates, ]
## # A tibble: 13 × 7
##    DATETIME            TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
##    <dttm>                    <dbl>       <dbl>     <int>     <dbl>      <dbl>
##  1 2011-01-01 00:00:00       8063.        21           0      2011          1
##  2 2011-10-10 10:30:00       9039.        18.9        10      2011         10
##  3 2011-10-10 18:30:00       9221.        16.1        18      2011         10
##  4 2011-10-10 19:30:00       9021.        15.5        19      2011         10
##  5 2012-01-01 00:00:00       7079.        15.4         0      2012          1
##  6 2013-01-01 00:00:00       7359.        21           0      2013          1
##  7 2014-01-01 00:00:00       7010.        20.4         0      2014          1
##  8 2015-01-01 00:00:00       7058.        20.9         0      2015          1
##  9 2016-01-01 00:00:00       7140.        16.9         0      2016          1
## 10 2017-01-01 00:00:00       7431.        22.6         0      2017          1
## 11 2018-01-01 00:00:00       7035.        22.4         0      2018          1
## 12 2019-01-01 00:00:00       7613.        22.3         0      2019          1
## 13 2020-01-01 00:00:00       7319.        19.4         0      2020          1
## # ℹ 1 more variable: IsHoliday <dbl>

4.2.0.8 Remove 13 Duplicate values

# Drop the 13 duplicated rows, keeping the first occurrence of each
dfModel <- distinct(dfModel)

# Confirm the leading rows are unchanged
head(dfModel)
## # A tibble: 6 × 7
##   DATETIME            TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
##   <dttm>                    <dbl>       <dbl>     <int>     <dbl>      <dbl>
## 1 2010-01-01 00:00:00       8038         23.1         0      2010          1
## 2 2010-01-01 00:30:00       7809.        22.9         0      2010          1
## 3 2010-01-01 01:00:00       7484.        22.6         1      2010          1
## 4 2010-01-01 01:30:00       7117.        22.5         1      2010          1
## 5 2010-01-01 02:00:00       6812.        22.5         2      2010          1
## 6 2010-01-01 02:30:00       6544.        22.4         2      2010          1
## # ℹ 1 more variable: IsHoliday <dbl>

4.2.0.9 Confirm No Duplicate in dfModel

# Check for duplicates after removal
duplicates <- duplicated(dfModel)

# View the rows that are duplicates (should now be an empty tibble)
dfModel[duplicates, ]
## # A tibble: 0 × 7
## # ℹ 7 variables: DATETIME <dttm>, TOTALDEMAND <dbl>, TEMPERATURE <dbl>,
## #   hourValue <int>, yearValue <dbl>, monthValue <dbl>, IsHoliday <dbl>
# Inspect the final modelling frame structure
str(dfModel)
## tibble [196,513 × 7] (S3: tbl_df/tbl/data.frame)
##  $ DATETIME   : POSIXct[1:196513], format: "2010-01-01 00:00:00" "2010-01-01 00:30:00" ...
##  $ TOTALDEMAND: num [1:196513] 8038 7809 7484 7117 6812 ...
##  $ TEMPERATURE: num [1:196513] 23.1 22.9 22.6 22.5 22.5 22.4 22.3 22.3 22.1 22.2 ...
##  $ hourValue  : int [1:196513] 0 0 1 1 2 2 3 3 4 4 ...
##  $ yearValue  : num [1:196513] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ monthValue : num [1:196513] 1 1 1 1 1 1 1 1 1 1 ...
##  $ IsHoliday  : num [1:196513] 1 1 1 1 1 1 1 1 1 1 ...

4.2.1 Filter a 5-Year Window of Data

# Define a 5-year analysis window.
# NOTE(review): despite the section title, this window does NOT end at the last
# observation — endDate is set 3 years before the final timestamp, so the
# window runs from roughly 8 years to 3 years before the end of the data.
# Confirm this offset is intentional.
endDate <- max(dfModel$DATETIME) - years(3)
startDate <- endDate - years(5)

# Keep only observations inside [startDate, endDate]
df5Yr <- dfModel %>%
  filter(DATETIME >= startDate & DATETIME <= endDate)

4.2.2 Training and Testing Dataset Splitting in 70/30

# Chronological 70/30 split: the first 70% of rows form the training set.
# No shuffling — row order is time order, which matters for forecasting.
trainSize <- floor(0.7 * nrow(df5Yr))

# Create the training and testing datasets
trainSet <- df5Yr[1:trainSize, ]
testSet <- df5Yr[(trainSize + 1):nrow(df5Yr), ]
print(testSet)
## # A tibble: 26,295 × 7
##    DATETIME            TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
##    <dttm>                    <dbl>       <dbl>     <int>     <dbl>      <dbl>
##  1 2016-09-16 05:00:00       6228.        13.7         5      2016          9
##  2 2016-09-16 05:30:00       6478.        13.5         5      2016          9
##  3 2016-09-16 06:00:00       6823.        12.7         6      2016          9
##  4 2016-09-16 06:30:00       7473.        13.2         6      2016          9
##  5 2016-09-16 07:00:00       7968.        13.8         7      2016          9
##  6 2016-09-16 07:30:00       8269.        13.4         7      2016          9
##  7 2016-09-16 08:00:00       8461.        13.4         8      2016          9
##  8 2016-09-16 08:30:00       8393.        14.6         8      2016          9
##  9 2016-09-16 09:00:00       8204.        16.2         9      2016          9
## 10 2016-09-16 09:30:00       8083.        17.2         9      2016          9
## # ℹ 26,285 more rows
## # ℹ 1 more variable: IsHoliday <dbl>
# Preview the training set
print(trainSet)
## # A tibble: 61,354 × 7
##    DATETIME            TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
##    <dttm>                    <dbl>       <dbl>     <int>     <dbl>      <dbl>
##  1 2013-03-18 00:00:00       6653.        16.4         0      2013          3
##  2 2013-03-18 00:30:00       6544.        15.5         0      2013          3
##  3 2013-03-18 01:00:00       6375.        15.3         1      2013          3
##  4 2013-03-18 01:30:00       6192.        14.9         1      2013          3
##  5 2013-03-18 02:00:00       6027.        14.4         2      2013          3
##  6 2013-03-18 02:30:00       5902.        14           2      2013          3
##  7 2013-03-18 03:00:00       5845.        13.9         3      2013          3
##  8 2013-03-18 03:30:00       5861.        13.8         3      2013          3
##  9 2013-03-18 04:00:00       5929.        13.6         4      2013          3
## 10 2013-03-18 04:30:00       6210.        13.3         4      2013          3
## # ℹ 61,344 more rows
## # ℹ 1 more variable: IsHoliday <dbl>
# Report the resulting split sizes
cat("Number of observations in the training set:", nrow(trainSet), "\n")
## Number of observations in the training set: 61354
cat("Number of observations in the testing set:", nrow(testSet), "\n")
## Number of observations in the testing set: 26295

4.2.2.1 Convert trainSet to tsibble

# Convert the training set to a tsibble keyed on the half-hourly DATETIME index
tsTrainSet <- trainSet %>% 
  as_tsibble(index = DATETIME)

4.2.2.2 Fill Gap Values

# Insert rows for any missing half-hourly timestamps; TOTALDEMAND = NA is the
# explicit fill value. The unchanged row count printed below indicates the
# training window contained no gaps.
tsTrainSet <- tsTrainSet %>%
  fill_gaps(TOTALDEMAND = NA)  # Fill Gap with NA
print(nrow(tsTrainSet))
## [1] 61354

4.2.2.3 Convert testSet to tsibble

# Convert the test set to a tsibble on the same half-hourly index
tsTestSet <- testSet %>%
  as_tsibble(index = DATETIME)

4.2.2.4 Fill Gap Values

# Insert rows for any missing half-hourly timestamps in the test window;
# the unchanged row count printed below indicates no gaps were present.
tsTestSet <- tsTestSet %>%
  fill_gaps(TOTALDEMAND = NA)  # Fill Gap values with NA
print(nrow(tsTestSet))
## [1] 26295

4.2.3 Model Training

# Train SARIMA model with first order differencing and Fourier daily seasonality.
# pdq(0,1,0) fixes a single non-seasonal difference with no AR/MA terms;
# PDQ(0,0,0) disables seasonal ARMA terms, leaving seasonality to the
# Fourier regressors (K=3 harmonics, period 48 = one day of half-hours).
sarimaD1FrDModel <- tsTrainSet %>% 
  model(
    sarimaD1FrD = ARIMA(TOTALDEMAND ~ pdq(0,1,0) + 
                        PDQ(0,0,0) + 
                        fourier(K=3, period=48))
      
  )
# Train SARIMA model with first order differencing, Fourier daily and weekly
# seasonality (adds a second Fourier block with period 48*7 = one week)
sarimD1FrDWModel <- tsTrainSet %>% 
  model(
    sarimD1FrDW = ARIMA(TOTALDEMAND ~ pdq(0,1,0) + 
                        PDQ(0,0,0) + 
                        fourier(K=3, period=48) +
                        fourier(K=3, period=48*7))
  )
# Train SARIMA model with first order differencing, Fourier daily and weekly
# seasonality, and TEMPERATURE as an exogenous regressor
sarimD1FrDWTempModel <- tsTrainSet %>% 
  model(
    sarimD1FrDWTemp = ARIMA(TOTALDEMAND ~ pdq(0,1,0) + 
                            PDQ(0,0,0) +
                            fourier(K=3, period=48) +
                            fourier(K=3, period=48*7) + 
                            TEMPERATURE)
  )

4.2.4 AIC BIC

# Model fit summaries ranked by BIC.
# FIX: the original piped `glance(fit)`; `fit` is not defined anywhere in this
# document and only avoided an error because it landed unevaluated in `...`.
# glance() on a mable takes no extra argument here.
sarimaD1FrDModel %>% glance() %>% arrange(BIC) %>% select(.model:BIC)
## # A tibble: 1 × 6
##   .model      sigma2  log_lik     AIC    AICc     BIC
##   <chr>        <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
## 1 sarimaD1FrD 21032. -392401. 784816. 784816. 784879.
sarimD1FrDWModel %>% glance() %>% arrange(BIC) %>% select(.model:BIC)
## # A tibble: 1 × 6
##   .model      sigma2  log_lik     AIC    AICc     BIC
##   <chr>        <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
## 1 sarimD1FrDW 20950. -392278. 784581. 784581. 784699.
sarimD1FrDWTempModel %>% glance() %>% arrange(BIC) %>% select(.model:BIC)
## # A tibble: 1 × 6
##   .model          sigma2  log_lik     AIC    AICc     BIC
##   <chr>            <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
## 1 sarimD1FrDWTemp 20614. -391782. 783591. 783591. 783717.

4.2.5 Models Performance with Test Set

4.2.5.1 Define function for MAE and RMSE

# Compute and print forecast accuracy against actuals.
#   fcFable  - fable forecast object carrying a DATETIME index and .mean column
#   actualTs - tsibble/tibble of actuals containing DATETIME and TOTALDEMAND
# Prints MAE and RMSE, and invisibly returns them as a named list so callers
# can also capture the values programmatically (the original returned nothing
# useful — it ended on a cat() call, which returns NULL).
calFcAcc <- function(fcFable, actualTs) {
  # Reduce the forecast to timestamp + point forecast
  avgFc <- fcFable %>%
    as_tibble() %>%
    select(DATETIME, .mean)
  
  # Join forecast data with actual data on the shared timestamp
  accData <- inner_join(avgFc, actualTs, by = "DATETIME")
  
  # Mean Absolute Error
  mae <- mean(abs(accData$.mean - accData$TOTALDEMAND), na.rm = TRUE)

  # Root Mean Squared Error
  rmse <- sqrt(mean((accData$.mean - accData$TOTALDEMAND)^2, na.rm = TRUE))
  cat("Mean Absolute Error (MAE):", mae, "\n")
  cat("Root Mean Squared Error (RMSE):", rmse, "\n")
  invisible(list(mae = mae, rmse = rmse))
}

4.2.6 Calculate Accuracy

# Forecast the test window with each fitted model and report accuracy.
# Per the recorded output, the simplest daily-Fourier model attains the lowest
# MAE/RMSE on the held-out data, despite having the worst in-sample BIC.
sarimaD1FrDFc <- sarimaD1FrDModel %>%
  forecast(new_data = tsTestSet)
# Print the forecast accuracy (MAE / RMSE)
calFcAcc(sarimaD1FrDFc, tsTestSet)
## Mean Absolute Error (MAE): 712.7173 
## Root Mean Squared Error (RMSE): 988.6672
# Forecasting using the model with first order differencing, daily and weekly seasonality using Fourier terms
sarimD1FrDWFc <- sarimD1FrDWModel %>%
  forecast(new_data = tsTestSet)

# Print the forecast accuracy (MAE / RMSE)
calFcAcc(sarimD1FrDWFc, tsTestSet)
## Mean Absolute Error (MAE): 725.3482 
## Root Mean Squared Error (RMSE): 1034.277
# Forecasting using the model with first order differencing, daily and weekly Fourier seasonality, and temperature
sarimD1FrDWTempFc <- sarimD1FrDWTempModel %>%
  forecast(new_data = tsTestSet)

# Print the forecast accuracy (MAE / RMSE)
calFcAcc(sarimD1FrDWTempFc, tsTestSet)
## Mean Absolute Error (MAE): 750.4085 
## Root Mean Squared Error (RMSE): 1076.07

4.2.7 Plot Model Forecast Demand vs Actual Demand

4.2.7.1 Define Function to convert Forecast from fable to tibble

# Convert a fable forecast object into a tibble of timestamps and point
# forecasts (DATETIME, .mean). The last pipeline expression is the return value.
fableTb <- function(fable) {
  fable %>%
    as_tibble() %>%
    select(DATETIME, .mean)
}
# Preview the test tsibble (30-minute interval)
head(tsTestSet)
## # A tsibble: 6 x 7 [30m] <Australia/Brisbane>
##   DATETIME            TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
##   <dttm>                    <dbl>       <dbl>     <int>     <dbl>      <dbl>
## 1 2016-09-16 05:00:00       6228.        13.7         5      2016          9
## 2 2016-09-16 05:30:00       6478.        13.5         5      2016          9
## 3 2016-09-16 06:00:00       6823.        12.7         6      2016          9
## 4 2016-09-16 06:30:00       7473.        13.2         6      2016          9
## 5 2016-09-16 07:00:00       7968.        13.8         7      2016          9
## 6 2016-09-16 07:30:00       8269.        13.4         7      2016          9
## # ℹ 1 more variable: IsHoliday <dbl>
# Rename each model's point-forecast column before joining so the three
# forecasts and the actuals can sit side by side
tbSarimaD1FrDFc <- fableTb(sarimaD1FrDFc) %>% 
  rename(sarimaD1FrD_mean = .mean)
tbSarimD1FrDWFc <- fableTb(sarimD1FrDWFc) %>% 
  rename(sarimD1FrDW_mean = .mean)
tbSarimD1FrDWTempFc <- fableTb(sarimD1FrDWTempFc) %>%
  rename(sarimD1FrDWTemp_mean = .mean)

# Successive full joins keep every timestamp present in any of the four inputs
joinData <- reduce(list(tbSarimaD1FrDFc, tbSarimD1FrDWFc, tbSarimD1FrDWTempFc, tsTestSet), full_join, by = "DATETIME")

# Plot LOESS-smoothed actual demand against each model's point forecasts.
# NOTE(review): geom_smooth() displays smoothed trends only — the raw
# half-hourly values are not drawn, so short-term differences between the
# models are hidden in this view.
plot <- ggplot(joinData, aes(x = DATETIME)) +
  geom_smooth(aes(y = TOTALDEMAND, color = "Actual Demand"), method = "loess", se = FALSE, linewidth = 1) +
  geom_smooth(aes(y = sarimaD1FrD_mean, color = "SarimaD1FrD Forecast"), method = "loess", se = FALSE, linewidth = 1) +
  geom_smooth(aes(y = sarimD1FrDW_mean, color = "SarimD1FrDW Forecast"), method = "loess", se = FALSE, linewidth = 1) +
  geom_smooth(aes(y = sarimD1FrDWTemp_mean, color = "SarimD1FrDWTemp Forecast"), method = "loess", se = FALSE, linewidth = 1) +
  labs(title = "Comparison of Actual Demand vs Forecasts",
       x = "DateTime",
       y = "Demand",
       color = "Legend") +
  theme_minimal() +
  scale_color_manual(values = c("Actual Demand" = "black", 
                                "SarimaD1FrD Forecast" = "blue", 
                                "SarimD1FrDW Forecast" = "red", 
                                "SarimD1FrDWTemp Forecast" = "green"))

# Print the plot
print(plot)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'